Code
library(tidyverse)
library(patchwork)
library(ggbeeswarm)
library(ggnewscale)
library(RColorBrewer)
library(hexbin)
library(ggExtra)
library(ggrepel)Libraries
library(tidyverse)
library(patchwork)
library(ggbeeswarm)
library(ggnewscale)
library(RColorBrewer)
library(hexbin)
library(ggExtra)
library(ggrepel)Paths
metadata_path <-
"data/processed/metadata_ashton_desj_all_weavepop_H99.csv"
chromosomes_path <-
"../Crypto_Desjardins/results/04.Intermediate_files/03.References/chromosome_lengths.tsv"
depth_by_chrom_good_desj_path <-
"../Crypto_Desjardins/results/04.Intermediate_files/02.Dataset/depth_quality/depth_by_chrom_good.tsv"
depth_by_chrom_good_ashton_path <-
"../Crypto_Ashton/results/04.Intermediate_files/02.Dataset/depth_quality/depth_by_chrom_good.tsv"
cnv_calls_path <-
"../Crypto_Desjardins_Ashton/results/02.Dataset/cnv_0.6/cnv_calls.tsv"
chrom_metrics_out_path <-
"results/tables/chrom_metrics.tsv"
chrom_metrics_filtered_out_path <-
"results/tables/chrom_metrics_filtered.tsv"Use the metadata table that has all the samples included in the final Crypto_Desjardins_Ashton dataset (n = 1055) and H99.
Select needed columns, remove H99 and get the number of samples per dataset and lineage, per lineage, and total.
metadata <- read.delim(
metadata_path,
header=TRUE,
sep=",",
stringsAsFactor = TRUE)
metadata <- metadata %>%
select(sample, strain, source, lineage, dataset, vni_subdivision)%>%
filter(!strain == "H99") %>%
group_by(dataset, lineage)%>%
mutate(samples_in_dataset_lineage = n_distinct(sample))%>%
ungroup() %>%
group_by(lineage)%>%
mutate(samples_in_lineage = n_distinct(sample))%>%
ungroup()%>%
mutate(total_samples = n_distinct(sample))%>%
droplevels()Get the nice chromosome names from the chromosomes file in WeavePop.
chromosomes = read.delim(
chromosomes_path,
header=TRUE, sep=",") %>%
mutate(chromosome = str_pad(chromosome, 2, pad = "0"))%>%
mutate(chromosome = as.factor(chromosome))
levels(chromosomes$chromosome) <- paste("chr", chromosomes$chromosome, sep="")
chromosomes <- chromosomes %>%
arrange(lineage, length)Lineage VNII is the only one in which chromosome names coincide with the length (chr01 largest, chr14 smallest).
Import the file with all called CNVs and add to it the chromosome information.
cnv_calls <- read.delim(
cnv_calls_path,
header=TRUE, sep="\t")%>%
left_join(chromosomes, by="accession")Create the depth thresholds used to identify deletions and duplications.
depth_threshold <- 0.6
del_threshold <- 1 - depth_threshold
dup_threshold <- 1 + depth_thresholdCreate dataframe for the called duplications only dupCNVs.
dup_calls <- filter(cnv_calls, cnv == "duplication")To filter out the dupCNVs that have very large Normalized Depth. Here I make groups of CNVs that start at the same position in the same chromosome.
dup_groups <- dup_calls %>%
group_by(lineage, chromosome, start, end, region_size) %>%
summarize(max_depth = max(norm_depth),
median_depth = median(norm_depth),
n = n(),
median_repeat = median(repeat_fraction))The dupCNVs groups with max depth larger than extreme_depth:
extreme_depth <- 5large_dups <- dup_groups %>%
filter((max_depth > extreme_depth))%>%
arrange(desc(n))
head(large_dups)| lineage | chromosome | start | end | region_size | max_depth | median_depth | n | median_repeat |
|---|---|---|---|---|---|---|---|---|
| VNI | chr01 | 339501 | 350500 | 11000 | 9.73 | 6.140 | 489 | 0.00 |
| VNI | chr02 | 274001 | 281500 | 7500 | 115.79 | 50.645 | 428 | 0.63 |
| VNI | chr02 | 274501 | 281000 | 6500 | 100.90 | 45.920 | 359 | 0.57 |
| VNI | chr10 | 582001 | 587000 | 5000 | 6.48 | 4.770 | 151 | 0.51 |
| VNI | chr03 | 1533001 | 1537000 | 4000 | 18.36 | 6.960 | 117 | 0.08 |
| VNI | chr01 | 1 | 2000 | 2000 | 32.38 | 5.410 | 108 | 0.00 |
Filter out duplications that are part of the groups in the previous table.
This way of filtering removes individual CNVs that not necessarily have depth grater than 5 but other samples in the lineage have a high depth CNV in the same region.
dup_calls_filter_depth <- dup_calls %>%
filter(!paste(start, end, chromosome, lineage) %in%
paste(large_dups$start, large_dups$end, large_dups$chromosome, large_dups$lineage))repeat_threshold = 0.25repetitive_dups <- dup_groups %>%
filter((median_repeat > 0.25))%>%
arrange(desc(n))dup_calls_filtered <- dup_calls_filter_depth %>%
filter(!paste(start, end, chromosome, lineage) %in%
paste(repetitive_dups$start, repetitive_dups$end,
repetitive_dups$chromosome, repetitive_dups$lineage))depth_by_chrom_good_desjardins <- read.delim(
depth_by_chrom_good_desj_path,
header=TRUE, sep="\t")
depth_by_chrom_good_ashton <- read.delim(
depth_by_chrom_good_ashton_path,
header=TRUE, sep="\t")
depth_by_chrom <- rbind(depth_by_chrom_good_desjardins, depth_by_chrom_good_ashton)%>%
select(sample, accession, norm_chrom_mean, norm_chrom_median)%>%
left_join(chromosomes, by = "accession")
head(depth_by_chrom)| sample | accession | norm_chrom_mean | norm_chrom_median | lineage | chromosome | length |
|---|---|---|---|---|---|---|
| SRS404518 | CP003820.1 | 0.97 | 0.98 | VNI | chr01 | 2291499 |
| SRS404518 | CP003821.1 | 1.12 | 0.98 | VNI | chr02 | 1621675 |
| SRS404518 | CP003822.1 | 1.01 | 0.99 | VNI | chr03 | 1575141 |
| SRS404518 | CP003823.1 | 0.99 | 1.00 | VNI | chr04 | 1084805 |
| SRS404518 | CP003824.1 | 0.98 | 0.99 | VNI | chr05 | 1814975 |
| SRS404518 | CP003825.1 | 0.99 | 1.00 | VNI | chr06 | 1422463 |
The following dataframes contain a row per individual called duplication with the information about the CNV and the chromosome of the sample. For the chromosome-sample without any called duplication, the columns related to CNVs have NAs.
all_metrics is created with all dupCNVs. all_metrics_filtered with the dupCNVs filtered by depth and repeat fraction.
all_metrics <- left_join(depth_by_chrom, dup_calls,
by = c("sample", "lineage", "accession", "chromosome", "length"))%>%
left_join(metadata, by = c("sample", "lineage"))
head(all_metrics)| sample | accession | norm_chrom_mean | norm_chrom_median | lineage | chromosome | length | start | end | cnv | region_size | depth | norm_depth | smooth_depth | repeat_fraction | overlap_bp | feature_id | strain | source | dataset | vni_subdivision | samples_in_dataset_lineage | samples_in_lineage | total_samples |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| SRS404518 | CP003820.1 | 0.97 | 0.98 | VNI | chr01 | 2291499 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | Bt134 | Clinical | Desjardins | VNIa-5 | 185 | 853 | 1055 |
| SRS404518 | CP003821.1 | 1.12 | 0.98 | VNI | chr02 | 1621675 | 274501 | 281000 | duplication | 6500 | 4049.51 | 37.85 | 37.32 | 0.57 | 3735 | Bt134 | Clinical | Desjardins | VNIa-5 | 185 | 853 | 1055 | |
| SRS404518 | CP003822.1 | 1.01 | 0.99 | VNI | chr03 | 1575141 | 1533001 | 1537000 | duplication | 4000 | 1246.32 | 11.64 | 5.32 | 0.08 | 312 | CNAG_06925,CNAG_06926 | Bt134 | Clinical | Desjardins | VNIa-5 | 185 | 853 | 1055 |
| SRS404518 | CP003823.1 | 0.99 | 1.00 | VNI | chr04 | 1084805 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | Bt134 | Clinical | Desjardins | VNIa-5 | 185 | 853 | 1055 |
| SRS404518 | CP003824.1 | 0.98 | 0.99 | VNI | chr05 | 1814975 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | Bt134 | Clinical | Desjardins | VNIa-5 | 185 | 853 | 1055 |
| SRS404518 | CP003825.1 | 0.99 | 1.00 | VNI | chr06 | 1422463 | NA | NA | NA | NA | NA | NA | NA | NA | NA | NA | Bt134 | Clinical | Desjardins | VNIa-5 | 185 | 853 | 1055 |
all_metrics_filtered <- left_join(depth_by_chrom, dup_calls_filtered,
by = c("sample", "lineage", "accession", "chromosome", "length"))%>%
left_join(metadata, by = c("sample", "lineage"))And keep the rest of the chromosome metrics.
chrom_metrics <- all_metrics %>%
group_by(sample, strain, lineage,
accession, chromosome, length,
source, vni_subdivision, dataset,
samples_in_lineage, samples_in_dataset_lineage,
total_samples,
norm_chrom_mean, norm_chrom_median)%>%
summarise(total_cnv_size = sum(region_size, na.rm = TRUE),
n_cnvs = sum(!is.na(cnv)),
first = min(start),
last = max(end),
mean_cnv_depth = round(mean(norm_depth),2),
median_cnv_depth = round(median(norm_depth),2),
repeat_size = sum(overlap_bp)) %>%
ungroup()%>%
mutate(dup_coverage_percent = round((total_cnv_size / length) * 100, 2),
dup_span_size = ifelse(is.na(last - first), 0, last - first),
dup_span_percent = round((dup_span_size / length) * 100, 2),
dup_repeat_percent = round((repeat_size / length) * 100, 2),
chromosome = as.factor(chromosome),
dup_coverage_percent = ifelse(is.na(dup_coverage_percent), 0, dup_coverage_percent),
dup_span_percent = ifelse(is.na(dup_span_percent), 0, dup_span_percent))
head(chrom_metrics)| sample | strain | lineage | accession | chromosome | length | source | vni_subdivision | dataset | samples_in_lineage | samples_in_dataset_lineage | total_samples | norm_chrom_mean | norm_chrom_median | total_cnv_size | n_cnvs | first | last | mean_cnv_depth | median_cnv_depth | repeat_size | dup_coverage_percent | dup_span_size | dup_span_percent | dup_repeat_percent |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| ERS1142690 | 20949_2#1 | VNI | CP003820.1 | chr01 | 2291499 | Clinical | VNIa-4 | Ashton | 853 | 668 | 1055 | 1.00 | 0.98 | 11000 | 1 | 339501 | 350500 | 7.12 | 7.12 | 0 | 0.48 | 10999 | 0.48 | 0.00 |
| ERS1142690 | 20949_2#1 | VNI | CP003821.1 | chr02 | 1621675 | Clinical | VNIa-4 | Ashton | 853 | 668 | 1055 | 1.20 | 0.96 | 6500 | 1 | 274501 | 281000 | 64.92 | 64.92 | 3735 | 0.40 | 6499 | 0.40 | 0.23 |
| ERS1142690 | 20949_2#1 | VNI | CP003822.1 | chr03 | 1575141 | Clinical | VNIa-4 | Ashton | 853 | 668 | 1055 | 0.99 | 0.98 | 0 | 0 | NA | NA | NA | NA | NA | 0.00 | 0 | 0.00 | NA |
| ERS1142690 | 20949_2#1 | VNI | CP003823.1 | chr04 | 1084805 | Clinical | VNIa-4 | Ashton | 853 | 668 | 1055 | 1.00 | 1.00 | 0 | 0 | NA | NA | NA | NA | NA | 0.00 | 0 | 0.00 | NA |
| ERS1142690 | 20949_2#1 | VNI | CP003824.1 | chr05 | 1814975 | Clinical | VNIa-4 | Ashton | 853 | 668 | 1055 | 0.97 | 0.98 | 0 | 0 | NA | NA | NA | NA | NA | 0.00 | 0 | 0.00 | NA |
| ERS1142690 | 20949_2#1 | VNI | CP003825.1 | chr06 | 1422463 | Clinical | VNIa-4 | Ashton | 853 | 668 | 1055 | 1.00 | 1.00 | 0 | 0 | NA | NA | NA | NA | NA | 0.00 | 0 | 0.00 | NA |
chrom_metrics_filtered <- all_metrics_filtered %>%
group_by(sample, strain, lineage,
accession, chromosome, length,
source, vni_subdivision, dataset,
samples_in_lineage, samples_in_dataset_lineage,
total_samples,
norm_chrom_mean, norm_chrom_median)%>%
summarise(total_cnv_size = sum(region_size, na.rm = TRUE),
n_cnvs = sum(!is.na(cnv)),
first = min(start),
last = max(end),
mean_cnv_depth = round(mean(norm_depth),2),
median_cnv_depth = round(median(norm_depth),2),
repeat_size = sum(overlap_bp)) %>%
ungroup()%>%
mutate(dup_coverage_percent = round((total_cnv_size / length) * 100, 2),
dup_span_size = ifelse(is.na(last - first), 0, last - first),
dup_span_percent = round((dup_span_size / length) * 100, 2),
dup_repeat_percent = round((repeat_size / length) * 100, 2),
chromosome = as.factor(chromosome),
dup_coverage_percent = ifelse(is.na(dup_coverage_percent), 0, dup_coverage_percent),
dup_span_percent = ifelse(is.na(dup_span_percent), 0, dup_span_percent))The total number of chromosomes in all samples in the dataset is: 14770
The total number of chromosomes in all samples with called duplications is: 5058
The total number of chromosomes in all samples with filtered called duplications is: 1790
Create file with chrom_metrics without any filtering of original CNVs.
Create file with chrom_metrics_filtered with filters in the original CNVs.
write_tsv(chrom_metrics, chrom_metrics_out_path)
write_tsv(chrom_metrics_filtered, chrom_metrics_filtered_out_path)cnv_calls %>%
group_by(cnv) %>%
summarize(
count = n(),
min_norm_depth = min(norm_depth, na.rm = TRUE),
mean_norm_depth = mean(norm_depth, na.rm = TRUE),
q25_norm_depth = quantile(norm_depth, probs = 0.25, na.rm = TRUE),
median_norm_depth = median(norm_depth, na.rm = TRUE),
q75_norm_depth = quantile(norm_depth, probs = 0.75, na.rm = TRUE),
max_norm_depth = max(norm_depth, na.rm = TRUE),
min_smooth_depth = min(smooth_depth, na.rm = TRUE),
mean_smooth_depth = mean(smooth_depth, na.rm = TRUE),
q25_smooth_depth = quantile(smooth_depth, probs = 0.25, na.rm = TRUE),
median_smooth_depth = median(smooth_depth, na.rm = TRUE),
q75_smooth_depth = quantile(smooth_depth, probs = 0.75, na.rm = TRUE),
max_smooth_depth = max(smooth_depth, na.rm = TRUE),
min_region_size = min(region_size, na.rm = TRUE),
mean_region_size = mean(region_size, na.rm = TRUE),
q25_region_size = quantile(region_size, probs = 0.25, na.rm = TRUE),
median_region_size = median(region_size, na.rm = TRUE),
q75_region_size = quantile(region_size, probs = 0.75, na.rm = TRUE),
max_region_size = max(region_size, na.rm = TRUE),
)| cnv | count | min_norm_depth | mean_norm_depth | q25_norm_depth | median_norm_depth | q75_norm_depth | max_norm_depth | min_smooth_depth | mean_smooth_depth | q25_smooth_depth | median_smooth_depth | q75_smooth_depth | max_smooth_depth | min_region_size | mean_region_size | q25_region_size | median_region_size | q75_region_size | max_region_size |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| deletion | 55594 | 0 | 0.079619 | 0.00 | 0.01 | 0.09 | 7.45 | 0.00 | 0.0747786 | 0.00 | 0.03 | 0.12 | 0.39 | 3 | 12935.77 | 5475 | 9000 | 17000 | 121500 |
| duplication | 9409 | 0 | 6.624390 | 1.66 | 1.79 | 2.47 | 115.79 | 1.61 | 6.4827144 | 1.64 | 1.74 | 2.13 | 115.79 | 244 | 14108.14 | 3000 | 5867 | 11000 | 726500 |
There are a lot more deletions than duplications.
d <- ggplot(cnv_calls, aes(repeat_fraction, fill = cnv)) +
facet_wrap(~cnv, ncol = 1, scales = "free")+
geom_histogram(binwidth = 0.01)+
scale_x_continuous(labels = scales::comma)+
theme_minimal() +
theme(legend.position = "none")+
labs(title = "Histogram of Repeat Fraction of CNVs",
subtitle = paste("Binwidth: 0.01"),
y = "Count",
x = "Repeat Fraction")
d In deletions, more of them have high repetitive fraction. In duplications, most of them have low repetitive fraction.
The Normalized depth (norm_depth) is the median of the normalized mean depth of the windows that are part of the CNV region.
The Smooth normalized depth (smooth_depth) is the median of the smooth normalized mean depth of the windows that are part of the CNV region.
d <- ggplot(cnv_calls, aes(norm_depth, fill = cnv)) +
facet_wrap(~cnv, ncol = 1)+
geom_histogram(binwidth = 0.1)+
scale_y_log10(labels = scales::comma) +
theme_minimal() +
theme(legend.position = "none")+
labs(title = "Histogram of Normalized Depth",
y = "Count (Log 10)",
x = "Normalized Depth")
s <- ggplot(cnv_calls, aes(smooth_depth, fill = cnv)) +
facet_wrap(~cnv, ncol = 1)+
geom_histogram(binwidth = 0.1)+
scale_y_log10(labels = scales::comma) +
theme_minimal() +
theme(legend.position = "none")+
labs(title = "Histogram of Smooth Normalized Depth",
y = "Count (Log 10)",
x = "Smooth Normalized Depth")
d | sThere are CNVs with extremely high normalized depth. I will truncate the normalized depth to 5.
d <- ggplot(cnv_calls, aes(norm_depth, fill = cnv)) +
facet_wrap(~cnv, ncol = 1)+
geom_histogram(binwidth = 0.01)+
geom_vline(xintercept = del_threshold)+
geom_vline(xintercept = dup_threshold)+
scale_y_log10(labels = scales::comma) +
xlim(0,5)+
theme_minimal() +
theme(legend.position = "none")+
labs(title = "Histogram of Normalized Depth",
subtitle = paste("Lines in depth threshold:", depth_threshold),
y = "Count (Log 10)",
x = "Normalized Depth (truncated)")
s <- ggplot(cnv_calls, aes(smooth_depth, fill = cnv)) +
facet_wrap(~cnv, ncol = 1)+
geom_histogram(binwidth = 0.01)+
geom_vline(xintercept = del_threshold)+
geom_vline(xintercept = dup_threshold)+
scale_y_log10(labels = scales::comma) +
xlim(0,5)+
theme_minimal() +
theme(legend.position = "none")+
labs(title = "Histogram of Smooth Normalized Depth",
subtitle = paste("Lines in depth threshold:", depth_threshold),
y = "Count (Log 10)",
x = "Smooth Normalized Depth (truncated)")
d | sMost deletions have depth (normalized and smooth) close to 0. Most duplications have depth (normalized and smooth) close to the depth threshold with which CNVs were called.
p <- ggplot(cnv_calls, aes(x = norm_depth, y = smooth_depth)) +
geom_abline(slope = 1, intercept = 0, linetype = "solid", color = "lightgray") +
geom_hex(bins = 50) +
facet_wrap(~cut(repeat_fraction, breaks = 4), nrow = 2) +
scale_fill_viridis_c(name = "Count\nLog10", direction = -1, trans = "log10") +
scale_x_continuous(labels = scales::comma) +
scale_y_continuous(labels = scales::comma) +
theme_minimal() +
labs(title = "Hexbin Plot of Normalized Depth vs Smooth Normalized Depth",
subtitle = "Panels by interval of Repeat Fraction",
x = "Normalized Depth",
y = "Smooth Normalized Depth")
o <- ggplot(cnv_calls, aes(x = norm_depth, y = smooth_depth)) +
geom_abline(slope = 1, intercept = 0, linetype = "solid", color = "lightgray") +
geom_hex(bins = 50) +
facet_wrap(~cut(repeat_fraction, breaks = 4), nrow = 2) +
scale_x_continuous(labels = scales::comma, limits = c(0,5)) +
scale_y_continuous(labels = scales::comma, limits = c(0,5)) +
scale_fill_viridis_c(name = "Count\nLog10", direction = -1, trans = "log10") +
theme_minimal() +
labs(title = "Hexbin Plot of Normalized Depth vs Smooth Normalized Depth (Truncated axes)",
subtitle = "Panels by interval of Repeat Fraction",
x = "Normalized Depth",
y = "Smooth Normalized Depth")
p / oSince the CNVs are called using the smooth_depth, but it does not always coincide with the norm_depth, there are called CNVs in between the depth threshold used in the CNV calling. Most CNVs are in the diagonal where smooth depth is the same as normalized depth.
There are many duplications with extremely high depth that are in the same high interval of repetitive fraction.
There are more deletion in the high repetitive fraction interval. There are more duplications in the low repetitive fraction interval.
d <- ggplot(cnv_calls, aes(region_size, fill = cnv)) +
facet_wrap(~cnv, ncol = 1)+
geom_histogram(binwidth = 1000)+
scale_x_continuous(labels = scales::comma)+
scale_y_log10(labels = scales::comma) +
theme_minimal() +
theme(legend.position = "none")+
labs(title = "Histogram of Sizes of CNVs",
y = "Count (Log10)",
x = "Size of CNVs")
d Only duplication are larger than ~150kb.
f <- ggplot(dup_calls_filtered, aes(y = region_size, x = length, color = chromosome)) +
geom_smooth(method = "lm", se = TRUE, color = "blue", linetype = "dashed") +
geom_point(alpha = 0.3)+
scale_y_continuous(labels = scales::comma) +
# facet_wrap(~lineage, ncol = 1)+
theme_minimal() +
theme(legend.position = "none")+
labs(title = "Distribution of Size of\nFiltered dupCNVs by Length of Chromosome",
y = "Size of CNVs",
x = "Length of chromosome")
u <- ggplot(dup_calls, aes(y = region_size, x = length, color = chromosome)) +
geom_smooth(method = "lm", se = TRUE, color = "blue", linetype = "dashed") +
geom_point(alpha = 0.3)+
scale_y_continuous(labels = scales::comma) +
# facet_wrap(~lineage, ncol = 1)+
theme_minimal() +
theme(legend.position = "none")+
labs(title = "Distribution of Size of\nall dupCNVs by Length of Chromosome",
y = "Size of CNVs",
x = "Length of chromosome")
u | fThe dupCNVs are larger in smaller chromosomes.
p <- ggplot(cnv_calls, aes(x = region_size, y = smooth_depth)) +
geom_abline(slope = 1, intercept = 0, linetype = "solid", color = "lightgray") +
geom_hex(bins = 50) +
facet_wrap(~cut(repeat_fraction, breaks = 4), nrow = 2) +
scale_fill_viridis_c(name = "Count\nLog10", direction = -1, trans = "log10") +
scale_x_continuous(labels = scales::comma) +
scale_y_continuous(labels = scales::comma) +
theme_minimal() +
labs(title = "Hexbin Plot of Smooth Normalized Depth vs CNV Size",
subtitle = "Panels by interval of Repeat Fraction",
x = "CNV Size",
y = "Smooth Normalized Depth")
o <- ggplot(cnv_calls, aes(x = region_size, y = smooth_depth)) +
geom_abline(slope = 1, intercept = 0, linetype = "solid", color = "lightgray") +
geom_hex(bins = 50) +
facet_wrap(~cut(repeat_fraction, breaks = 4), nrow = 2) +
scale_x_continuous(labels = scales::comma) +
scale_y_continuous(labels = scales::comma, limits = c(0,5)) +
scale_fill_viridis_c(name = "Count\nLog10", direction = -1, trans = "log10") +
theme_minimal() +
labs(title = "Hexbin Plot of Smooth Normalized Depth vs CNV Size (Truncated Y axis)",
subtitle = "Panels by interval of Repeat Fraction",
x = "CNV Size",
y = "Smooth Normalized Depth")
p / oThere are many small CNVs in a high repetitive fraction interval that have extremely high depth.
p <- ggplot(cnv_calls, aes(x = region_size, y = norm_depth)) +
geom_abline(slope = 1, intercept = 0, linetype = "solid", color = "lightgray") +
geom_hex(bins = 50) +
facet_wrap(~cut(repeat_fraction, breaks = 4), nrow = 2) +
scale_fill_viridis_c(name = "Count\nLog10", direction = -1, trans = "log10") +
scale_x_continuous(labels = scales::comma) +
scale_y_continuous(labels = scales::comma) +
theme_minimal() +
labs(title = "Hexbin Plot of Normalized Depth vs CNV Size",
subtitle = "Panels by interval of Repeat Fraction",
x = "CNV Size",
y = "Normalized Depth")
o <- ggplot(cnv_calls, aes(x = region_size, y = norm_depth)) +
geom_abline(slope = 1, intercept = 0, linetype = "solid", color = "lightgray") +
geom_hex(bins = 50) +
facet_wrap(~cut(repeat_fraction, breaks = 4), nrow = 2) +
scale_fill_viridis_c(name = "Count\nLog10", direction = -1, trans = "log10") +
scale_x_continuous(labels = scales::comma) +
scale_y_continuous(labels = scales::comma, limits = c(0,5)) +
theme_minimal() +
labs(title = "Hexbin Plot of Normalized Depth vs CNV Size (Truncated Y axis)",
subtitle = "Panels by interval of Repeat Fraction",
x = "CNV Size",
y = "Normalized Depth")
p / oMostly, the CNVs with very high smooth_depth are small.
The CNVs whose norm_depth is in between the depth thresholds are small. (These are the ones that the norm_depth and smooth_depth don’t coincide).
Some dupCNVs are large and have very high depth (the ones that are neither in the vertical cluster of small CNVs and high depth or the horizontal cluster of large CNVs and depth around 2):
filter(cnv_calls, region_size > 50000, smooth_depth > 2.5)%>%
select(sample, lineage, chromosome, norm_depth, smooth_depth, region_size, repeat_fraction)| sample | lineage | chromosome | norm_depth | smooth_depth | region_size | repeat_fraction |
|---|---|---|---|---|---|---|
| SRS885892 | VNBI | chr04 | 4.39 | 4.51 | 212000 | 0.03 |
| SRS417676 | VNI | chr04 | 2.83 | 2.84 | 52000 | 0.08 |
| SRS417676 | VNI | chr09 | 3.20 | 3.46 | 190000 | 0.02 |
| ERS2541126 | VNI | chr13 | 2.50 | 2.58 | 129000 | 0.12 |
| ERS2541274 | VNI | chr13 | 2.53 | 2.58 | 97500 | 0.09 |
| ERS542397 | VNI | chr04 | 2.76 | 2.76 | 704500 | 0.03 |
| ERS2541331 | VNI | chr04 | 3.76 | 3.78 | 82000 | 0.01 |
| ERS2541212 | VNI | chr13 | 2.66 | 2.64 | 129000 | 0.12 |
| ERS542490 | VNI | chr04 | 3.23 | 3.24 | 344000 | 0.05 |
| ERS2541064 | VNI | chr06 | 5.10 | 5.16 | 65000 | 0.02 |
The CNVs with the highest Normalized Depth (smooth or not), are highly repetitive.
There are many delCNVs completely repetitive. The majority of the dupCNVs are not repetitive.
Distribution of repeat fraction in dupCNVs after filtering out very high depth groups of CNVs.
n <- ggplot(dup_calls_filter_depth, aes(x = chromosome, y = norm_depth, color = chromosome)) +
geom_quasirandom()+
facet_wrap(~cut(repeat_fraction, breaks = 4), ncol = 1) +
theme_minimal() +
ylim(0,5)+
theme(legend.position = "none", axis.text.x = element_text(angle = 90))+
labs(title = "Distribution of Normalized Depth of dupCNVs",
subtitle = "Panels by interval of Repeat Fraction",
y = "Normalized Depth",
x = "Chromosome")
s <- ggplot(dup_calls_filter_depth, aes(x = chromosome, y = smooth_depth, color = chromosome)) +
geom_quasirandom()+
facet_wrap(~cut(repeat_fraction, breaks = 4), ncol = 1) +
theme_minimal() +
ylim(0,5)+
theme(legend.position = "none", axis.text.x = element_text(angle = 90))+
labs(title = "Distribution of Smooth Normalized Depth of dupCNVs",
subtitle = "Panels by interval of Repeat Fraction",
y = "Smooth Normalized Depth",
x = "Chromosome")
n | sDistribution of repeat fraction in dupCNVs after filtering out very high depth groups of CNVs and CNVs with repeat fraction above repeat_treshold.
n <- ggplot(dup_calls_filtered, aes(x = chromosome, y = norm_depth, color = chromosome)) +
geom_quasirandom()+
theme_minimal() +
ylim(0,5)+
theme(legend.position = "none", axis.text.x = element_text(angle = 90))+
labs(title = "Distribution of Normalized Depth of dupCNVs",
y = "Normalized Depth",
x = "Chromosome")
s <- ggplot(dup_calls_filtered, aes(x = chromosome, y = smooth_depth, color = chromosome)) +
geom_quasirandom()+
theme_minimal() +
ylim(0,5)+
theme(legend.position = "none", axis.text.x = element_text(angle = 90))+
labs(title = "Distribution of Smooth Normalized Depth of dupCNVs",
y = "Smooth Normalized Depth",
x = "Chromosome")
n | sThe Normalized median depth of chromosomes (norm_chrom_median) is the normalized median of the depth of the positions in the whole chromosome. (First the median was calculated and then normalized). The Normalized mean depth of chromosomes (norm_chrom_mean) is the normalized mean of the depth of the positions in the whole chromosome. (First the mean was calculated and then normalized).
med <- ggplot(chrom_metrics)+
geom_histogram(aes(norm_chrom_median), binwidth = 0.01)+
scale_x_continuous(breaks = seq(0,max(chrom_metrics$norm_chrom_median), by = 0.1)) +
theme_minimal() +
labs(title = "Histogram of Normalized median depth of chromosomes",
y = "Count",
x = "Normalized median depth of chromosomes")
mea <- ggplot(chrom_metrics)+
geom_histogram(aes(norm_chrom_mean), binwidth = 0.01)+
scale_x_continuous(breaks = seq(0,max(chrom_metrics$norm_chrom_median), by = 0.1)) +
theme_minimal() +
labs(title = "Histogram of Normalized mean depth of chromosomes",
y = "Count",
x = "Normalized mean depth of chromosomes")
med / meamed <- ggplot(filter(chrom_metrics, norm_chrom_median > 1.2 ))+
geom_histogram(aes(norm_chrom_median), binwidth = 0.01)+
scale_x_continuous(breaks = seq(0,max(chrom_metrics$norm_chrom_median), by = 0.1)) +
scale_y_continuous(
breaks = seq(0, max(table(round(chrom_metrics$norm_chrom_median ))) , 1)
)+
theme_minimal() +
theme(panel.grid.minor = element_blank())+
labs(title = "Histogram of Normalized median depth of chromosomes",
subtitle = "Only values above 1.2",
y = "Count",
x = "Normalized median depth of chromosomes")
mea <- ggplot(filter(chrom_metrics, norm_chrom_mean > 1.2 ))+
geom_histogram(aes(norm_chrom_mean), binwidth = 0.01)+
scale_x_continuous(breaks = seq(0,max(chrom_metrics$norm_chrom_mean), by = 0.1)) +
#scale_y_continuous(breaks = seq(0, max(table(round(chrom_metrics$norm_chrom_mean ))) , 1))+
theme_minimal() +
theme(panel.grid.minor = element_blank())+
labs(title = "Histogram of Normalized mean depth of chromosomes",
subtitle = "Only values above 1.2",
y = "Count",
x = "Normalized mean depth of chromosomes")
med / meaChromosomes with very high median depth:
chrom_metrics %>%
select(lineage, sample, strain, chromosome, norm_chrom_median, norm_chrom_mean)%>%
filter(norm_chrom_median > 2.2)%>%
arrange(norm_chrom_median)| lineage | sample | strain | chromosome | norm_chrom_median | norm_chrom_mean |
|---|---|---|---|---|---|
| VNI | ERS1142697 | 20427_2#6 | chr12 | 2.24 | 2.20 |
| VNI | ERS2541126 | BMD3144 | chr13 | 2.24 | 2.23 |
| VNBI | SRS885841 | NRHc5009.REL.INI | chr14 | 2.25 | 2.23 |
| VNI | SRS404481 | Bt139 | chr13 | 2.27 | 2.20 |
| VNI | SRS885916 | PMHc1031A.ENR.INI.LP | chr12 | 2.31 | 2.34 |
| VNI | ERS2541212 | 04CN-03-039 | chr13 | 2.45 | 2.39 |
| VNBII | SRS881180 | PMHc1029.ENR.STOR | chr12 | 2.45 | 2.42 |
| VNI | ERS542397 | 14936_1#6 | chr04 | 2.74 | 2.34 |
u <- ggplot(filter(chrom_metrics, n_cnvs > 0), aes(x = length, y = n_cnvs, color = chromosome))+
geom_point(alpha = 0.1)+
geom_smooth(method = "lm", se = TRUE, color = "blue", linetype = "dashed") +
theme_minimal() +
theme(legend.position = "none")+
labs(title = "Number of dupCNVs\nper Chromosome by Length",
y = "Number of dupCNVs",
x = "Length of Chromosome (bp)")
f <- ggplot(filter(chrom_metrics_filtered, n_cnvs > 0), aes(x = length, y = n_cnvs, color = chromosome))+
geom_point(alpha = 0.1)+
geom_smooth(method = "lm", se = TRUE, color = "blue", linetype = "dashed") +
theme_minimal() +
labs(title = "Number of filtered dupCNVs\nper Chromosome by Length",
y = "Number of dupCNVs",
x = "Length of Chromosome (bp)")
u | fLarger chromosomes have more dupCNVs
n_per_chrom <- chrom_metrics %>%
filter(n_cnvs > 0)%>%
group_by(chromosome, lineage, length)%>%
summarize(n = n())
n_per_chrom_filtered <- chrom_metrics_filtered %>%
filter(n_cnvs > 0)%>%
group_by(chromosome, lineage, length)%>%
summarize(n = n())
u <- ggplot(n_per_chrom, aes(x = length, y = n, color = chromosome))+
geom_col()+
geom_smooth(method = "lm", se = TRUE, color = "blue", linetype = "dashed") +
theme_minimal() +
theme(legend.position = "none")+
labs(title = "Number of Chromosomes with CNVs by Length",
y = "Number of Chromosomes",
x = "Length of Chromosome (bp)")
f <- ggplot(n_per_chrom_filtered, aes(x = length, y = n, color = chromosome))+
geom_col()+
geom_smooth(method = "lm", se = TRUE, color = "blue", linetype = "dashed") +
theme_minimal() +
labs(title = "Number of Chromosomes with filtered CNVs by Length",
y = "Number of Chromosomes",
x = "Length of Chromosome (bp)")
u | fggplot(filter(chrom_metrics, n_cnvs > 0))+
geom_quasirandom(aes(x = chromosome, y = dup_coverage_percent, color = chromosome), alpha = 0.1)+
theme_minimal() +
labs(title = "Percent of Chromosome Covered by dupCNVs per Chromosome",
subtitle = "Only chromosomes with at least 1 dupCNV",
y = "Percent of Chromosome Covered by dupCNVs",
x = "Chromosome")p <- ggplot(filter(chrom_metrics, dup_coverage_percent > 10))+
geom_quasirandom(aes(x = chromosome, y = dup_coverage_percent))+
theme_minimal() +
labs(title = "Percent of Chromosome Covered by dupCNVs per Chromosome",
subtitle = "Only chromosomes with at least 10% covered",
x = "Chromosome",
y = "Percent of Chromosome Covered by dupCNVs")
pp <- ggplot(filter(chrom_metrics, n_cnvs > 0))+
geom_quasirandom(aes(x = chromosome, y = dup_span_percent))+
theme_minimal() +
labs(title = "Percent of Chromosome Spanned by dupCNVs per Chromosome",
subtitle = "Only chromosomes with at least 1 dupCNV",
y = "Percent of Chromosome Spanned by dupCNVs",
x = "Chromosome")
pp <- ggplot(filter(chrom_metrics, dup_coverage_percent > 10))+
geom_quasirandom(aes(x = chromosome, y = dup_span_percent))+
theme_minimal() +
labs(title = "Percent of Chromosome Spanned by dupCNVs per Chromosome",
subtitle = "Only chromosomes with at least 10% spanned",
y = "Percent of Chromosome Spanned by dupCNVs",
x = "Chromosome")
ggMarginal(p, type = "histogram")ggplot(chrom_metrics, aes(y = norm_chrom_median, x = chromosome))+
geom_boxplot()+
geom_hline(yintercept =1, color = "black", linetype = "solid")+
geom_hline(yintercept =2, color = "black", linetype = "solid")+
theme_minimal() +
labs(title = "Histogram of Normalized median depth of chromosomes",
y = "Count",
x = "Normalized median depth of chromosomes")ggplot(chrom_metrics, aes(y = norm_chrom_median, x = chromosome, color = lineage))+
geom_boxplot()+
geom_hline(yintercept =1, color = "black", linetype = "solid")+
geom_hline(yintercept =2, color = "black", linetype = "solid")+
theme_minimal() +
labs(title = "Histogram of Normalized median depth of chromosomes",
y = "Count",
x = "Normalized median depth of chromosomes")ggplot(chrom_metrics, aes(y = norm_chrom_median, x = length, color = chromosome))+
geom_point(alpha = 0.05)+
facet_wrap(~lineage, ncol = 1)+
geom_smooth(method = "lm", se = TRUE, color = "blue", linetype = "dashed") +
# geom_hline(yintercept =1, color = "black", linetype = "solid")+
# geom_hline(yintercept =2, color = "black", linetype = "solid")+
theme_minimal() +
labs(title = "Histogram of Normalized median depth of chromosomes",
y = "Normalized median depth of chromosomes",
x = "Chromosome length")ggplot(chrom_metrics, aes(y = norm_chrom_median, x = length, color = chromosome))+
geom_point(alpha = 0.05)+
geom_smooth(method = "lm", se = TRUE, color = "blue", linetype = "dashed") +
geom_hline(yintercept =1, color = "black", linetype = "solid")+
geom_hline(yintercept =2, color = "black", linetype = "solid")+
theme_minimal() +
labs(title = "Histogram of Normalized median depth of chromosomes",
y = "Normalized median depth of chromosomes",
x = "Chromosome length")ggplot(filter(chrom_metrics, lineage == "VNI"), aes(y = norm_chrom_mean, x = length, color = chromosome))+
geom_boxplot()+
# geom_hline(yintercept =1, color = "black", linetype = "solid")+
# geom_hline(yintercept =2, color = "black", linetype = "solid")+
theme_classic() +
labs(title = "Histogram of Normalized median depth of chromosomes",
y = "Count",
x = "Normalized median depth of chromosomes")ggplot(filter(chrom_metrics, lineage == "VNI"), aes(y = norm_chrom_mean, x = length))+
geom_hex(bins = 50)+
geom_hline(yintercept =1, color = "black", linetype = "solid")+
geom_hline(yintercept =2, color = "black", linetype = "solid")+
scale_fill_viridis_c(name = "Count\nLog10", direction = -1, trans = "log10") +
theme_classic() +
labs(title = "Histogram of Normalized median depth of chromosomes",
y = "Count",
x = "Normalized median depth of chromosomes")ggplot(filter(chrom_metrics, n_cnvs > 0), aes(x = dup_coverage_percent, y = dup_span_percent)) +
geom_hex(bins=50) +
scale_x_continuous(labels = scales::comma) +
geom_abline(slope = 1, intercept = 0, linetype = "solid", color = "lightgray") +
scale_fill_viridis_c(name = "Count\nLog10", direction = -1, trans = "log10") +
theme_minimal() +
theme(legend.position = "right")+
labs(title = "Percent of Chromosome Spanned by dupCNVs vs Covered by dupCNVs",
y = "Percent of Chromosome Spanned by dupCNVs",
x = "Percent of Chromosome Covered by dupCNVs")ggplot(filter(chrom_metrics, n_cnvs > 0), aes(x = dup_coverage_percent, y = n_cnvs)) +
geom_hex(bins=50) +
scale_x_continuous(labels = scales::comma) +
scale_fill_viridis_c(name = "Count\nLog10", direction = -1, trans = "log10") +
theme_minimal() +
theme(legend.position = "right")+
labs(title = "Number of CNVs vs Percent of Chromosome Covered by dupCNVs ",
y = "Number of CNVs",
x = "Percent of Chromosome Covered by dupCNVs")ggplot(chrom_metrics, aes(x = dup_coverage_percent, y = dup_span_percent)) +
geom_point(aes(color = n_cnvs)) +
scale_x_continuous(labels = scales::comma) +
scale_color_distiller(palette = "GnBu", direction = -1, trans = "log2", name = "Number\n of CNVs") +
theme_minimal() +
theme(legend.position = "right")+
labs(title = "Percent of Chromosome Covered by CNVs vs Spanned by CNVs",
y = "Percent of Chromosome Spanned by CNVs",
x = "Percent of Chromosome Covered by CNVs")ggplot(chrom_metrics, aes(x = dup_coverage_percent, y = dup_span_percent)) +
geom_point(aes(color = norm_chrom_median)) +
scale_x_continuous(labels = scales::comma) +
scale_color_viridis_c(name = "Normalized Median\nDepth of Chromosome", direction = -1) +
theme_minimal() +
theme(legend.position = "right")+
labs(title = "Percent of Chromosome Covered by CNVs vs Spanned by CNVs",
y = "Percent of Chromosome Spanned by CNVs",
x = "Percent of Chromosome Covered by CNVs")ggplot(chrom_metrics, aes(x = dup_coverage_percent, y = dup_span_percent)) +
geom_point(aes(color = norm_chrom_mean)) +
facet_wrap(~cut(norm_chrom_median, breaks = 6), nrow = 2) +
scale_x_continuous(labels = scales::comma) +
scale_color_viridis_c(name = "Normalized Mean\nDepth of Chromosome", direction = -1) +
theme_minimal() +
theme(legend.position = "right")+
labs(title = "Percent of Chromosome Covered by CNVs vs Spanned by CNVs",
y = "Percent of Chromosome Spanned by CNVs",
x = "Percent of Chromosome Covered by CNVs")The Normalized depth (norm_depth) is the median of the normalized depth of the windows that are part of each dupCNV region.
The Median depth of dupCNV (median_cnv_depth) is the median of the Normalized depth of all dupCNVs in the chromosome. There are values bellow the threshold of depth to call a Duplication CNV because that threshold was applied to the smoothed values and this are the raw values.
The Normalized median depth of chromosomes (norm_chrom_median) is the normalized median of the depth of the positions in the whole chromosome. (First the median was calculated and then normalized).
The Percent of Chromosome Covered by dupCNVs (dup_coverage_percent) is the percentage of the full length of the chromosome that is part of called dupCNVs.
The Percent of Chromosome Spanned by dupCNVs (dup_span_percent) is the percentage of the full length of the chromosome that is in between the leftmost and rightmost dupCNVs.
ggplot(chrom_metrics, aes(x = dup_coverage_percent, y = norm_chrom_median)) +
geom_hline(yintercept = c(1, 2), color = "black", linetype = "dashed") +
geom_hex(bins = 50)+
scale_fill_viridis_c(name = "Count\nLog10", direction = -1, trans = "log10") +
theme_minimal() +
theme(legend.position = "right") +
labs(title = "Percent of Chromosome Covered by dupCNVs vs. Depth of Chromosome",
y = "Normalized Median Depth of Chromosome",
x = "Percent of Chromosome Covered by dupCNVs")ggplot(chrom_metrics, aes(x = dup_coverage_percent, y = norm_chrom_mean)) +
geom_hline(yintercept = c(1, 2), color = "black", linetype = "dashed") +
geom_hex(bins = 50)+
scale_fill_viridis_c(name = "Count\nLog10", direction = -1, trans = "log10") +
theme_minimal() +
theme(legend.position = "right") +
labs(title = "Percent of Chromosome Covered by dupCNVs vs. Depth of Chromosome",
y = "Normalized Mean Depth of Chromosome",
x = "Percent of Chromosome Covered by dupCNVs")color_range <- paste("2.6 -", max(chrom_metrics$median_cnv_depth, na.rm = TRUE))
colors <- c("#FDE725", "#1F9E89", "#440154")
names(colors) <-c("0 - 1.6", "1.6 - 2.6", color_range)
ggplot(chrom_metrics, aes(x = dup_coverage_percent, y = norm_chrom_median)) +
geom_hline(yintercept = c(1, 2), color = "black", linetype = "dashed") +
geom_point(aes(color = cut(median_cnv_depth,
breaks = c(-Inf, 1.6, 2.6, Inf),
labels = c("0 - 1.6", "1.6 - 2.6", color_range))),
alpha = 0.5) +
scale_color_manual(values = colors,
name = "Median dupCNV Depth") +
theme_minimal() +
theme(legend.position = "right") +
labs(title = "Percent of Chromosome Covered by dupCNVs vs. Depth of Chromosome",
y = "Normalized Median Depth of Chromosome",
x = "Percent of Chromosome Covered by dupCNVs")color_range <- paste("2.6 -", max(chrom_metrics$mean_cnv_depth, na.rm = TRUE))
colors <- c("#FDE725", "#1F9E89", "#440154")
names(colors) <-c("0 - 1.6", "1.6 - 2.6", color_range)
ggplot(chrom_metrics, aes(x = dup_coverage_percent, y = norm_chrom_mean)) +
geom_hline(yintercept = c(1, 2), color = "black", linetype = "dashed") +
geom_point(aes(color = cut(mean_cnv_depth,
breaks = c(-Inf, 1.6, 2.6, Inf),
labels = c("0 - 1.6", "1.6 - 2.6", color_range))),
alpha = 0.5) +
scale_color_manual(values = colors,
name = "Mean dupCNV Depth") +
theme_minimal() +
theme(legend.position = "right") +
labs(title = "Percent of Chromosome Covered by dupCNVs vs. Depth of Chromosome",
y = "Normalized Mean Depth of Chromosome",
x = "Percent of Chromosome Covered by dupCNVs")